This R script is used to merge RS data downloaded the Google Drive folder shared by Bea with the ReSurvey database.
library(tidyverse)
library(here)
library(lubridate)
# Set the folder path
folder_path <- "C:/Data/MOTIVATE/MOTIVATE_RS_data/S2"
# List only CSV files that contain "max_Filtered" in their filename
csv_files <- list.files(folder_path,
pattern = "Sentinel_Mean_Plot_Year_Allmetrics.*\\.csv$",
full.names = TRUE, recursive = TRUE)
# So far remove this one that is wrong
csv_files <- setdiff(csv_files,
"C:/Data/MOTIVATE/MOTIVATE_RS_data/S2/ALP/New_filter/ALP_BAL_Sentinel_Mean_Plot_Year_Allmetrics.csv")
# Function to read each file and extract info from the filename
read_and_label <- function(file_path) {
file_name <- basename(file_path)
# Extract region and subregion from the filename
# Updated regular expression to handle the case where subregion is missing
components <- str_match(file_name,
"^[0-9_]*([^_]+)_([^_]+)_Sentinel.*?_(.*?).csv")
# Extract the biogeo and unit, handling missing subregion (unit)
biogeo <- components[2]
# If subregion (unit) is missing, set it as NA
unit <- ifelse(is.na(components[3]), NA, components[3])
# Check if biogeo is missing, and if so,
# assign the first part of the filename (region name)
if (is.na(biogeo) && grepl("Sentinel", file_name)) {
# Capture the first part (biogeo) directly
biogeo <- str_match(file_name, "^[0-9_]*([^_]+)_Sentinel")[2]
}
# If biogeo is still NA, print a warning
if (is.na(biogeo)) {
warning(paste("Failed to extract biogeo for file:", file_name))
}
# Read CSV and add columns for extracted info
read_csv(file_path) %>%
mutate(biogeo = biogeo, unit = unit)
}
# Read and merge all CSV files
data_RS_S2 <- map_dfr(csv_files, read_and_label)
Rows: 2442 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 64 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 689 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 1504 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 2784 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 1 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 339 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 980 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 2948 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 175 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 1070 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 2345 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 247 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 32274 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 2569 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 358 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 762 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 15 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 31 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 2979 Columns: 38── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): system:index, source, .geo
dbl (35): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, Lat_update, Lon_update, NDMI_max,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View the resulting tibble
print(data_RS_S2)
# Counts per biogeo and unit
print(data_RS_S2 %>% count(biogeo, unit), n = 100)
Keep all indices and metrics in case they are useful.
data_RS_S2 <- data_RS_S2 %>%
# Keep the columns we need
select(obs_unique, biogeo, unit, year, source, Lat_update, Lon_update,
starts_with("NDVI"), starts_with("NDMI"), starts_with("NDWI"),
starts_with("EVI"), starts_with("SAVI")) %>%
# Rename Lat and Lon, these are only kept in case there is difference with
# those in the ReSurvey database due to updates based on Ilona's info
rename(Lat_RS = Lat_update, Lon_RS = Lon_update) %>%
# Same for year
rename(year_RS = year)
Manually renamed all files as REGION_SUBREGION_Phenology.
# Set the folder path
folder_path <- "C:/Data/MOTIVATE/MOTIVATE_RS_data/S2"
# List only CSV files that contain "Phenology" in their filename
csv_files <- list.files(folder_path, pattern = "Phenology.*\\.csv$",
full.names = TRUE, recursive = TRUE)
# Function to read each file and extract info from the filename
read_and_label <- function(file_path) {
file_name <- basename(file_path)
# Extract region and subregion from the filename
# Updated regular expression to handle the case where subregion is missing
components <- str_match(file_name,
"^[0-9_]*([^_]+)_([^_]+)_Phenology.csv")
# Extract the biogeo and unit, handling missing subregion (unit)
biogeo <- components[2]
# If subregion (unit) is missing, set it as NA
unit <- ifelse(is.na(components[3]), NA, components[3])
# Check if biogeo is missing, and if so,
# assign the first part of the filename (region name)
if (is.na(biogeo) && grepl("_Phenology", file_name)) {
# Capture the first part (biogeo) directly
biogeo <- str_match(file_name, "^[0-9_]*([^_]+)_Phenology")[2]
}
# If biogeo is still NA, print a warning
if (is.na(biogeo)) {
warning(paste("Failed to extract biogeo for file:", file_name))
}
# Read CSV and add columns for extracted info
read_csv(file_path) %>%
mutate(biogeo = biogeo, unit = unit)
}
# Read and merge all CSV files
data_RS_S2_phen <- map_dfr(csv_files, read_and_label)
Rows: 2442 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 268 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 64 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 689 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 2784 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 1 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 339 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 980 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 2948 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 175 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 1070 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 2345 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 247 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 358 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 762 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 15 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 31 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 2979 Columns: 15── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (13): EOS_DOY, Lat_update, Lon_update, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Pe...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View the resulting tibble
print(data_RS_S2_phen)
# Counts per biogeo and unit
print(data_RS_S2_phen %>% count(biogeo, unit), n = 100)
data_RS_S2_phen <- data_RS_S2_phen %>%
# Keep the columns we need:
# Dates were wrongly exported from GEE - removed them and recalculated later.
# Remove Lat and Lon and year, in case there is difference with
# those in the ReSurvey database due to updates based on Ilona's info,
# we have Lat_RS, Lon_RS and year_RS from data_RS_S2
select(obs_unique, biogeo, unit, SOS_DOY, NDVI_at_SOS, Peak_DOY, NDVI_at_Peak,
EOS_DOY, NDVI_at_EOS, Season_Length, year) %>%
# make_date() creates a date at the start of the year (January 1st).
# By adding days(DOY - 1), we correctly position the date within the year.
mutate(SOS_date = make_date(year) + days(SOS_DOY - 1),
Peak_date = make_date(year) + days(Peak_DOY - 1),
EOS_date = make_date(year) + days(EOS_DOY - 1)) %>%
# Remove year
select(-year)
# Set the folder path
folder_path <- "C:/Data/MOTIVATE/MOTIVATE_RS_data/Landsat"
# List only CSV files that contain "Plot" in their filename
csv_files <- list.files(folder_path, pattern = "Plot.*\\.csv$",
full.names = TRUE, recursive = TRUE)
# Remove ALP_BAL so far cause there seems to be an error in that table
csv_files <- csv_files[-1]
# Define the expected column names
expected_columns <- c("system:index", "Lat_update", "Lon_update", "obs_unique",
"plot_uniqu", "source", "year", "EVI_max", "EVI_median",
"EVI_min", "EVI_p10", "EVI_p90", "EVI_stdDev", "EVImean",
"NDMI_max", "NDMI_median", "NDMI_min", "NDMI_p10",
"NDMI_p90", "NDMI_stdDev", "NDMImean", "NDVI_max",
"NDVI_median", "NDVI_min", "NDVI_p10", "NDVI_p90",
"NDVI_stdDev", "NDVImean", "NDWI_max", "NDWI_median",
"NDWI_min", "NDWI_p10", "NDWI_p90", "NDWI_stdDev",
"NDWImean", "SAVI_max", "SAVI_median", "SAVI_min",
"SAVI_p10", "SAVI_p90", "SAVI_stdDev", "SAVImean", ".geo")
# Define the column types
column_types <- cols(
`system:index` = col_character(), Lat_update = col_double(),
Lon_update = col_double(), obs_unique = col_double(),
plot_uniqu = col_character(), source = col_character(), year = col_integer(),
EVI_max = col_double(), EVI_median = col_double(), EVI_min = col_double(),
EVI_p10 = col_double(), EVI_p90 = col_double(), EVI_stdDev = col_double(),
EVImean = col_double(), NDMI_max = col_double(), NDMI_median = col_double(),
NDMI_min = col_double(), NDMI_p10 = col_double(), NDMI_p90 = col_double(),
NDMI_stdDev = col_double(), NDMImean = col_double(), NDVI_max = col_double(),
NDVI_median = col_double(), NDVI_min = col_double(), NDVI_p10 = col_double(),
NDVI_p90 = col_double(), NDVI_stdDev = col_double(), NDVImean = col_double(),
NDWI_max = col_double(), NDWI_median = col_double(), NDWI_min = col_double(),
NDWI_p10 = col_double(), NDWI_p90 = col_double(), NDWI_stdDev = col_double(),
NDWImean = col_double(), SAVI_max = col_double(), SAVI_median = col_double(),
SAVI_min = col_double(), SAVI_p10 = col_double(), SAVI_p90 = col_double(),
SAVI_stdDev = col_double(), SAVImean = col_double(), .geo = col_character()
)
# Function to read each file and extract info from the filename
read_and_label <- function(file_path) {
file_name <- basename(file_path)
# Extract region and subregion from the filename
# Updated regular expression to handle the case where subregion is missing
components <- str_match(file_name,
"^[0-9_]*([^_]+)_([^_]+)_Landsat.*?_(.*?).csv")
# Extract the biogeo and unit, handling missing subregion (unit)
biogeo <- components[2]
# If subregion (unit) is missing, set it as NA
unit <- ifelse(is.na(components[3]), NA, components[3])
# Check if biogeo is missing, and if so,
# assign the first part of the filename (region name)
if (is.na(biogeo) && grepl("Landsat", file_name)) {
# Capture the first part (biogeo) directly
biogeo <- str_match(file_name, "^[0-9_]*([^_]+)_Landsat")[2]
}
# If biogeo is still NA, print a warning
if (is.na(biogeo)) {
warning(paste("Failed to extract biogeo for file:", file_name))
}
delimiter <- ifelse(grepl(";", readLines(file_path, n = 1)), ";", ",")
# Read CSV and add columns for extracted info
data <- read_delim(file_path, delim = delimiter, col_types = column_types) %>%
mutate(biogeo = biogeo, unit = unit)
# Reorder columns based on expected columns
data <- data %>%
select(all_of(expected_columns), everything())
return(data)
}
# Read and merge all CSV files
data_RS_Landsat <- map_dfr(csv_files, read_and_label)
# View the resulting tibble
print(data_RS_Landsat)
# Counts per biogeo and unit
print(data_RS_Landsat %>% count(biogeo, unit), n = 100)
Keep all indices and metrics in case they are useful.
data_RS_Landsat <- data_RS_Landsat %>%
# Keep the columns we need
select(obs_unique, biogeo, unit, year, source, Lat_update, Lon_update,
starts_with("NDVI"), starts_with("NDMI"), starts_with("NDWI"),
starts_with("EVI"), starts_with("SAVI")) %>%
# Rename Lat and Lon, these are only kept in case there is difference with
# those in the ReSurvey database due to updates based on Ilona's info
rename(Lat_RS = Lat_update, Lon_RS = Lon_update) %>%
# Same for year
rename(year_RS = year)
data_RS_CH <- read_csv(
"C:/Data/MOTIVATE/MOTIVATE_RS_data/Canopy_Height_1m/Europe_points_CanopyHeight_1m.csv")
Rows: 425310 Columns: 8── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system:index, .geo
dbl (6): Lat_update, Lon_update, canopy_height, obs_unique, plot_uniqu, year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_RS_CH
data_RS_CH <- data_RS_CH %>%
# Keep the columns we need
select(obs_unique, canopy_height)
In this file, there is the correspondence obs_unique - PlotObservationID.
db_Europa <- read_csv(
here("..", "DB_first_check", "data", "clean","db_Europa_20250107.csv")
)
Rows: 425310 Columns: 12── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (6): Country, RS_CODE, ReSurvey site, ReSurvey plot, Expert System, Location method
dbl (6): PlotObservationID, Lon_updated, Lat_updated, plot_unique_id, year, obs_unique_id
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Get only the columns PlotObservationID (original unique identifier) obs_unique_id (unique identified created by me) and year.
db_Europa <- db_Europa %>% select(PlotObservationID, obs_unique_id)
data_RS_S2_ID <- db_Europa %>%
right_join(data_RS_S2 %>%
# Rename to be able to join on this column
rename(obs_unique_id = obs_unique))
Joining with `by = join_by(obs_unique_id)`
Now we have PlotObservationID in data_RS_S2_ID.
data_RS_S2_phen_ID <- db_Europa %>%
right_join(data_RS_S2_phen %>%
# Rename to be able to join on this column
rename(obs_unique_id = obs_unique))
Joining with `by = join_by(obs_unique_id)`
Now we have PlotObservationID in data_RS_S2_phen_ID
data_RS_Landsat_ID <- db_Europa %>%
right_join(data_RS_Landsat %>%
# Rename to be able to join on this column
rename(obs_unique_id = obs_unique))
Joining with `by = join_by(obs_unique_id)`
Now we have PlotObservationID in data_RS_Landsat_ID.
data_RS_CH_ID <- db_Europa %>%
right_join(data_RS_CH %>%
# Rename to be able to join on this column
rename(obs_unique_id = obs_unique))
Joining with `by = join_by(obs_unique_id)`
Now we have PlotObservationID in data_RS_CH_ID.
This is the ReSurvey database after updates (to be continued).
db_resurv <- read_tsv(
here("..", "DB_first_check","data", "clean","db_resurv_updated_clean.csv"),
col_types = cols(
# Dynamically specify EUNIS columns as character
.default = col_guess(), # Default guessing for other columns
EUNISa = col_character(),
EUNISb = col_character(),
EUNISc = col_character(),
EUNISd = col_character(),
EUNISa_1 = col_character(),
EUNISa_2 = col_character(),
EUNISa_3 = col_character(),
EUNISa_4 = col_character(),
EUNISb_1 = col_character(),
EUNISb_2 = col_character(),
EUNISb_3 = col_character(),
EUNISb_4 = col_character(),
EUNISc_1 = col_character(),
EUNISc_2 = col_character(),
EUNISc_3 = col_character(),
EUNISc_4 = col_character(),
EUNISd_1 = col_character(),
EUNISd_2 = col_character(),
EUNISd_3 = col_character(),
EUNISd_4 = col_character(),
EUNISa_1_descr = col_character(),
EUNISb_1_descr = col_character(),
EUNISc_1_descr = col_character(),
EUNISd_1_descr = col_character(),
EUNIS_assignation = col_character(),
EUNISa_2_descr = col_character(),
EUNISa_3_descr = col_character(),
EUNISa_4_descr = col_character(),
EUNISb_2_descr = col_character(),
EUNISb_3_descr = col_character(),
EUNISb_4_descr = col_character(),
EUNISc_2_descr = col_character(),
EUNISc_3_descr = col_character(),
EUNISc_4_descr = col_character(),
EUNISd_2_descr = col_character(),
EUNISd_3_descr = col_character(),
EUNISd_4_descr = col_character()
)
)
No parsing issues!
For some points, there is data both from S2 and Landsat. In those cases, use the S2 data because it is more precise (10 m vs 30 m).
data_RS_S2_ID <- data_RS_S2_ID %>%
rename_with(~ paste0(., "_S2"), starts_with("NDVI")) %>%
rename_with(~ paste0(., "_S2"), starts_with("NDMI")) %>%
rename_with(~ paste0(., "_S2"), starts_with("NDWI")) %>%
rename_with(~ paste0(., "_S2"), starts_with("EVI")) %>%
rename_with(~ paste0(., "_S2"), starts_with("SAVI")) %>%
select(-source)
data_RS_Landsat_ID <- data_RS_Landsat_ID %>%
rename_with(~ paste0(., "_Landsat"), starts_with("NDVI")) %>%
rename_with(~ paste0(., "_Landsat"), starts_with("NDMI")) %>%
rename_with(~ paste0(., "_Landsat"), starts_with("NDWI")) %>%
rename_with(~ paste0(., "_Landsat"), starts_with("EVI")) %>%
rename_with(~ paste0(., "_Landsat"), starts_with("SAVI")) %>%
select(-source)
Join S2, S2_phen and Landsat data:
data_RS <- data_RS_S2_ID %>%
full_join(data_RS_S2_phen_ID) %>%
full_join(data_RS_Landsat_ID)
Joining with `by = join_by(PlotObservationID, obs_unique_id, biogeo, unit)`Joining with `by = join_by(PlotObservationID, obs_unique_id, biogeo, unit, year_RS, Lat_RS, Lon_RS)`
Number of observations with NDVI_max data from both S2 and Landsat:
nrow(data_RS %>% filter(!is.na(NDVI_max_S2) & !is.na(NDVI_max_Landsat)))
[1] 939
Difference between NDVI_max values from S2 and Landsat:
data_RS %>% filter(!is.na(NDVI_max_S2) & !is.na(NDVI_max_Landsat)) %>%
mutate(diff_NDVI_max = abs(NDVI_max_S2 - NDVI_max_Landsat)) %>%
ggplot(aes(x = diff_NDVI_max, fill = paste(biogeo, unit, sep = "-"))) +
geom_histogram(color = "black") +
facet_wrap(~ paste(biogeo, unit, sep = "-")) + theme(legend.position = "none")
data_RS %>% filter(!is.na(NDMI_max_S2) & !is.na(NDMI_max_Landsat)) %>%
mutate(diff_NDMI_max = abs(NDMI_max_S2 - NDMI_max_Landsat)) %>%
ggplot(aes(x = diff_NDMI_max, fill = paste(biogeo, unit, sep = "-"))) +
geom_histogram(color = "black") +
facet_wrap(~ paste(biogeo, unit, sep = "-")) + theme(legend.position = "none")
There is a large difference between NDVI values from S2 and Landsat. So far, use the S2 data, but checking with Bea / Jose.
When values are available from both satellites, use S2:
data_RS <- data_RS %>%
mutate(across(
matches("^(NDVI|NDMI|NDWI|EVI|SAVI)_(max|median|min|mode|p10|p90)_S2$"),
~ case_when(
# If both the current column and the corresponding Landsat column are NA,
# set to NA_real_
is.na(.x) & is.na(get(sub("_S2$", "_Landsat", cur_column()))) ~ NA_real_,
# If the corresponding Landsat column is NA, use the current column's value
is.na(get(sub("_S2$", "_Landsat", cur_column()))) ~ .x,
# If the current column is NA, use the corresponding Landsat column's value
is.na(.x) ~ get(sub("_S2$", "_Landsat", cur_column())),
# Otherwise, use the current column's value
TRUE ~ .x
), .names = "{col}_combined")) %>%
rename_with(~ sub("_S2_combined$", "", .), matches("_S2_combined$"))
db_resurv_RS <- db_resurv %>%
left_join(data_RS %>% select(-obs_unique_id)) %>%
left_join(data_RS_CH_ID %>% select(-obs_unique_id)) %>%
mutate(S2_data = !is.na(NDVI_max_S2) & !is.na(NDMI_max_S2),
Landsat_data = !is.na(NDVI_max_Landsat) & !is.na(NDMI_max_Landsat),
CH_data = !is.na(canopy_height),
S2_phen_data = !is.na(SOS_DOY)) %>%
# So far, remove cols for _S2 and _Landsat
select(-matches("_(S2|Landsat)$"))
Joining with `by = join_by(PlotObservationID)`Joining with `by = join_by(PlotObservationID)`
db_resurv_RS %>% count(S2_data)
db_resurv_RS %>% count(Landsat_data)
db_resurv_RS %>% count(CH_data)
db_resurv_RS %>% count(S2_phen_data)
Save clean file for analyses (to be updated continuously due to updates in ReSurvey database and updates on RS data).
write_tsv(db_resurv_RS,here("data", "clean","db_resurv_RS_20250505.csv"))
sessionInfo()
R version 4.4.2 (2024-10-31 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26100)
Matrix products: default
locale:
[1] LC_COLLATE=Spanish_Spain.utf8 LC_CTYPE=Spanish_Spain.utf8 LC_MONETARY=Spanish_Spain.utf8
[4] LC_NUMERIC=C LC_TIME=Spanish_Spain.utf8
time zone: Europe/Madrid
tzcode source: internal
attached base packages:
[1] stats4 grid stats graphics grDevices utils datasets methods base
other attached packages:
[1] randomForest_4.7-1.2 moreparty_0.4 caret_7.0-1 lattice_0.22-6
[5] ggparty_1.0.0 partykit_1.2-23 libcoin_1.0-10 party_1.3-18
[9] strucchange_1.5-4 sandwich_3.1-1 zoo_1.8-12 modeltools_0.2-23
[13] mvtnorm_1.3-3 ggeffects_2.2.0 car_3.1-3 carData_3.0-5
[17] lmerTest_3.1-3 lme4_1.1-36 Matrix_1.7-1 dtplyr_1.3.1
[21] rnaturalearth_1.0.1 sf_1.0-19 scales_1.3.0 readxl_1.4.3
[25] gridExtra_2.3 here_1.0.1 lubridate_1.9.4 forcats_1.0.0
[29] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
[33] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] rstudioapi_0.17.1 jsonlite_1.8.9 magrittr_2.0.3 TH.data_1.1-3
[5] farver_2.1.2 nloptr_2.1.1 rmarkdown_2.29 vctrs_0.6.5
[9] minqa_1.2.8 terra_1.8-15 htmltools_0.5.8.1 varImp_0.4
[13] cellranger_1.1.0 Formula_1.2-5 pROC_1.18.5 sass_0.4.9
[17] parallelly_1.42.0 bslib_0.9.0 KernSmooth_2.23-24 htmlwidgets_1.6.4
[21] plyr_1.8.9 cachem_1.1.0 mime_0.12 lifecycle_1.0.4
[25] iterators_1.0.14 pkgconfig_2.0.3 R6_2.5.1 fastmap_1.2.0
[29] shiny_1.10.0 rbibutils_2.3 future_1.34.0 digest_0.6.37
[33] numDeriv_2016.8-1.1 colorspace_2.1-1 rprojroot_2.0.4 labeling_0.4.3
[37] timechange_0.3.0 httr_1.4.7 abind_1.4-8 compiler_4.4.2
[41] proxy_0.4-27 bit64_4.6.0-1 withr_3.0.2 backports_1.5.0
[45] DBI_1.2.3 MASS_7.3-61 lava_1.8.1 classInt_0.4-11
[49] ModelMetrics_1.2.2.2 tools_4.4.2 units_0.8-5 httpuv_1.6.15
[53] future.apply_1.11.3 nnet_7.3-19 glue_1.8.0 promises_1.3.2
[57] nlme_3.1-166 inum_1.0-5 checkmate_2.3.2 reshape2_1.4.4
[61] generics_0.1.3 recipes_1.1.1 gtable_0.3.6 tzdb_0.4.0
[65] class_7.3-22 data.table_1.16.4 hms_1.1.3 utf8_1.2.4
[69] coin_1.4-3 foreach_1.5.2 pillar_1.10.1 vroom_1.6.5
[73] later_1.4.1 splines_4.4.2 bit_4.5.0.1 survival_3.7-0
[77] tidyselect_1.2.1 knitr_1.49 reformulas_0.4.0 xfun_0.50
[81] measures_0.3 hardhat_1.4.1 timeDate_4041.110 matrixStats_1.5.0
[85] DT_0.33 phosphoricons_0.2.1 stringi_1.8.4 yaml_2.3.10
[89] boot_1.3-31 shinyWidgets_0.9.0 evaluate_1.0.3 codetools_0.2-20
[93] cli_3.6.3 rpart_4.1.23 xtable_1.8-4 Rdpack_2.6.2
[97] jquerylib_0.1.4 munsell_0.5.1 Rcpp_1.0.14 globals_0.16.3
[101] parallel_4.4.2 rclipboard_0.2.1 gower_1.0.2 listenv_0.9.1
[105] viridisLite_0.4.2 ipred_0.9-15 prodlim_2024.06.25 e1071_1.7-16
[109] crayon_1.5.3 insight_1.0.2 rlang_1.1.5 multcomp_1.4-28